"""Predict concrete compressive strength from the data in concrete_data.xls.

Apply feature engineering and model tuning to reach an R² score between
0.80 and 0.95.
"""
# Silence library warnings so the notebook output stays readable.
import warnings
warnings.filterwarnings('ignore')

## importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# NOTE(review): '%matplotlib inline' is IPython magic, not Python syntax —
# it breaks this file when run as a plain script, and it appeared twice.
# Uncomment the next line (once) when running inside a Jupyter notebook.
# %matplotlib inline
# importing data
# NOTE(review): the task statement refers to concrete_data.xls, but this
# reads "concrete.csv" — presumably a CSV export of that workbook; confirm
# the file exists alongside this script.
df = pd.read_csv("concrete.csv")
df.head()    # first five rows, to eyeball the columns
df.shape     # (rows, columns)
df.dtypes    # per-column data types
# Statistical summary (extra upper percentiles help spot heavy tails)
df.describe(percentiles=[.25,.5,.75,.90,.95,.99]).transpose()
df.median()  # robust central tendency, to compare with the means above
# check for null values
df.isna().sum()
# you can also find it using the snippet below
df.isnull().apply(pd.value_counts)
# Draw a boxplot for every column on a 3x3 subplot grid to eyeball outliers.
# BUG FIX: in the flattened source the loop body had lost its indentation,
# which is a SyntaxError in Python; enumerate() also replaces the manual
# position counter.
plt.figure(figsize=(20, 10))  # overall figure size
for pos, col in enumerate(df.columns, start=1):
    plt.subplot(3, 3, pos)    # one grid cell per column (9 columns expected)
    sns.boxplot(df[col]);
# Cap extreme values (winsorize) at the 10th and 90th percentiles.
# NOTE(review): the original comments said "IQR method", but this is
# percentile capping, not IQR-based filtering — values are clipped in
# place, not removed. Capping the target column ('strength') is also
# debatable; confirm that is intended. The six copy-pasted np.where
# pairs are collapsed into one loop; the resulting values are identical.
for col in ['slag', 'water', 'superplastic', 'fineagg', 'age', 'strength']:
    lo = df[col].quantile(0.10)   # lower cap: 10th percentile
    hi = df[col].quantile(0.90)   # upper cap: 90th percentile
    df[col] = df[col].clip(lower=lo, upper=hi)
# Pairwise scatter plots (KDE on the diagonal) of the main predictors
# against strength, then an annotated correlation heatmap.
pair_cols = ['strength', 'cement', 'slag', 'ash', 'water',
             'superplastic', 'fineagg', 'age']
sns.pairplot(df[pair_cols], diag_kind='kde', height=3, aspect=1)
plt.figure(figsize=(20, 10))
sns.heatmap(df.corr(), annot=True);
# Line plot of each column against strength on a 3x3 subplot grid.
# BUG FIX: the loop body had lost its indentation (SyntaxError in a .py
# file); enumerate() replaces the manual position counter.
# NOTE(review): data=df.head() plots only the FIRST FIVE rows — this was
# presumably meant to be the full df; confirm before trusting the plots.
plt.figure(figsize=(20, 10))  # figure size
for pos, col in enumerate(df.columns, start=1):
    plt.subplot(3, 3, pos)
    sns.lineplot(x=col, y='strength', data=df.head())
## Define X and Y variables
# Unscaled copies, kept for the later pipeline-based CV runs where
# scaling happens inside each fold instead.
X_orig = df.drop('strength', axis=1)
y_orig = df[['strength']]
# NOTE(review): the comment below has the terminology backwards — it is
# the predictors (independent variables) that are scaled here; moreover
# 'strength' (the target) IS included in cols_to_scale, and scaling the
# whole df before the train/test split leaks test statistics into
# training. Confirm this is intentional (the leakage is addressed later
# via a pipeline for the CV runs).
# Here all dependent variable are scaled, independent variable is not necessary to be scaled
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
cols_to_scale = ['cement', 'slag', 'ash', 'water', 'superplastic', 'coarseagg', 'fineagg', 'age', 'strength']
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale].to_numpy())
df.head()
## Define X and Y variables (now standardized)
X = df.drop('strength', axis=1)
y = df[['strength']]
# Splitting the data into train and test
from sklearn.model_selection import train_test_split
# 70/30 split; fixed random_state for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)
import statsmodels.api as sm # Importing statsmodels
# create a first fitted model
# NOTE(review): no constant column is added (sm.add_constant), so this
# OLS is fit WITHOUT an intercept and statsmodels reports R² against the
# uncentered total sum of squares — confirm that is intended.
lm_1 = sm.OLS(y_train,X_train).fit()
#Let's see the summary of our first linear model
print(lm_1.summary())
# Drop the two aggregate columns (judged insignificant from the summary
# above) from both splits, then refit to compare.
X_train = X_train.drop('coarseagg', axis=1)
X_train = X_train.drop('fineagg', axis=1)
X_test = X_test.drop('coarseagg', axis=1)
X_test = X_test.drop('fineagg', axis=1)
lm_2 = sm.OLS(y_train,X_train).fit()
print(lm_2.summary()) #Inferential statistics
# Rebuild X / y without the two aggregate columns dropped above, mirror
# the drop on the unscaled copy, then re-split 70/30 with the same seed.
dropped = ['coarseagg', 'fineagg']
X = df.drop(['strength'] + dropped, axis=1)
y = df[['strength']]
X_orig = X_orig.drop(dropped, axis=1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100)
# Linear Regression on the reduced (feedback) feature set.
from sklearn.linear_model import LinearRegression #importing linear regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
lr = LinearRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test) # Predictions from linear regression
tr_lin = lr.score(X_train, y_train)  # R² on the training split
print("Training Accuracy of Simple Linear Regression is", tr_lin)
r2_lin = r2_score(y_test, pred)      # R² on the held-out split
print("R2 Score for Simple Linear Regression is", r2_lin)
data_train = pd.concat([X_train, y_train], axis=1)
data_train.head()
# FIX: use sklearn's mean_squared_error (already imported above) instead
# of np.mean over a (pred - DataFrame) broadcast, whose Series result was
# then fed to math.sqrt — this returns a clean scalar MSE.
mse = mean_squared_error(y_test, pred)
import math
math.sqrt(mse)  # RMSE
# Predicted-vs-true scatter with the ideal y = x reference line.
fig, ax1 = plt.subplots(1, 1, figsize=(12, 4))
ax1.scatter(pred, y_test, s=20)
limits = [y_test.min(), y_test.max()]
ax1.plot(limits, limits, 'k--', lw=2)
ax1.set_ylabel("True")
ax1.set_xlabel("Predicted")
ax1.set_title("Linear Regression")
# Expand the feature matrix with degree-2 interaction terms only, then
# refit the linear model on the expanded features.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_poly = poly.fit_transform(X)
X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(
    X_poly, y, train_size=0.70, test_size=0.30, random_state=1)
X_train.shape  # sanity check: shape of the (non-poly) training split
lr.fit(X_train_poly, y_train_poly)
ypoly_pred = lr.predict(X_test_poly)
poly_score = lr.score(X_train_poly, y_train_poly)
print("Training Accuracy with polynomial features is", poly_score)
acc = r2_score(y_test_poly, ypoly_pred)
print("R2 score by adding polynomial features is: ", acc)
print(lr.coef_[0])  # first coefficient row (y is a one-column DataFrame)
# Predicted-vs-true scatter for the interaction-features model.
fig, ax1 = plt.subplots(1, 1, figsize=(12, 4))
ax1.scatter(ypoly_pred, y_test_poly, s=20)
ax1.plot([y_test_poly.min(), y_test_poly.max()],
         [y_test_poly.min(), y_test_poly.max()], 'k--', lw=2)
ax1.set_ylabel("True")
ax1.set_xlabel("Predicted")
ax1.set_title("Linear Regression")
# BUG FIX: the error must be computed against y_test_poly (the split that
# produced ypoly_pred, random_state=1), not y_test from the earlier split
# (random_state=100) — the original silently compared misaligned rows.
mse = np.mean((ypoly_pred - y_test_poly) ** 2)
math.sqrt(mse)  # RMSE of the polynomial-features model
# Re-split the UNSCALED data for the cross-validation comparisons below.
# Since scaling has to be done inside the K-Fold cross-validation step to
# avoid data leakage, a pipeline is constructed and passed to the K-Fold
# cross-validation as a parameter.
# (FIX: the sentence above was a bare prose line in the original .py
# file — a SyntaxError — and misspelled "cross-validation".)
X_train, X_test, y_train, y_test = train_test_split(
    X_orig, y_orig, train_size=0.7, test_size=0.3, random_state=100)
num_folds = 5   # K in K-fold CV
seed = 2        # shared shuffle seed so every model sees the same folds
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
clf = LinearRegression()
class Scaler(StandardScaler):
    """StandardScaler used as the scaling step of the model pipelines.

    Pipelines only ever call the inherited fit/transform, so this class
    behaves exactly like StandardScaler there. The convenience method
    below originally read ``super.fit(...)`` (missing parentheses on
    ``super``), which would raise AttributeError if it were ever called;
    fixed here. (The class body had also lost its indentation in the
    flattened source.)
    """

    def transformation(self, X_poly, y=None):
        # Fit the scaler on the data, then return the scaled values.
        self.fit(X_poly, y)
        return self.transform(X_poly)
# Baseline: linear regression inside a scale-then-fit pipeline, scored
# with K-fold cross-validated R²; results go into a comparison table.
pipeline = Pipeline([('sc', Scaler()), ('estimator', clf)])
lin_cv = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
result = cross_val_score(pipeline, X_train, y_train, scoring='r2', cv=lin_cv)
result
print("Accuracy: %.3f%% (%.3f%%)" % (result.mean() * 100.0, result.std() * 100))
resultsDf = pd.DataFrame(
    {'K-Fold CV': ['LinearRegression'],
     'accuracy': [result.mean() * 100.0],
     'deviation(%)': [result.std() * 100]},
    index=[1])
resultsDf = resultsDf[['K-Fold CV', 'accuracy', 'deviation(%)']]
resultsDf
# Decision tree regressor through the same pipeline + K-fold setup.
from sklearn.tree import DecisionTreeRegressor
dcc = DecisionTreeRegressor()
pipeline = Pipeline([('sc', Scaler()), ('estimator', dcc)])
dt_cv = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
dcc_result = cross_val_score(pipeline, X_train, y_train, cv=dt_cv)
dcc_result
print("Accuracy: %.3f%% (%.3f%%)" % (dcc_result.mean() * 100.0, dcc_result.std() * 100))
# Append this model's row to the comparison table.
tempResultsDf = pd.DataFrame(
    {'K-Fold CV': ['DecissionTrees'],
     'accuracy': [dcc_result.mean() * 100.0],
     'deviation(%)': [dcc_result.std() * 100]},
    index=[2])
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['K-Fold CV', 'accuracy', 'deviation(%)']]
resultsDf
# Random forest regressor through the same pipeline + K-fold setup.
from sklearn.ensemble import RandomForestRegressor
rfc = RandomForestRegressor(n_estimators=100)
pipeline = Pipeline([('sc', Scaler()), ('estimator', rfc)])
rf_cv = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
rfc_result = cross_val_score(pipeline, X_train, y_train, cv=rf_cv)
rfc_result
print("Accuracy: %.3f%% (%.3f%%)" % (rfc_result.mean() * 100.0, rfc_result.std() * 100))
# Append this model's row to the comparison table.
tempResultsDf = pd.DataFrame(
    {'K-Fold CV': ['RandomForests'],
     'accuracy': [rfc_result.mean() * 100.0],
     'deviation(%)': [rfc_result.std() * 100]},
    index=[3])
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['K-Fold CV', 'accuracy', 'deviation(%)']]
resultsDf
# Bagging regressor through the same pipeline + K-fold setup.
from sklearn.ensemble import BaggingRegressor
bgr = BaggingRegressor(n_estimators=100)
pipeline = Pipeline([('sc', Scaler()), ('estimator', bgr)])
br_cv = KFold(n_splits=num_folds, shuffle = True, random_state=seed)
# BUG FIX: every other model in this table is cross-validated on
# X_train/y_train, but this one used the FULL X_orig/y_orig — an unfair
# comparison that also lets the model see the held-out test rows.
bgr_result = cross_val_score(pipeline, X_train, y_train, cv = br_cv)
bgr_result
print("Accuracy: %.3f%% (%.3f%%)" % (bgr_result.mean()*100.0, bgr_result.std()*100))
# Append this model's row to the comparison table.
tempResultsDf = pd.DataFrame({'K-Fold CV':['Bagging'], 'accuracy': [bgr_result.mean()*100.0], 'deviation(%)': [bgr_result.std()*100]}, index=[4])
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['K-Fold CV', 'accuracy', 'deviation(%)']]
resultsDf
# Gradient boosting regressor through the same pipeline + K-fold setup.
from sklearn.ensemble import GradientBoostingRegressor
abr = GradientBoostingRegressor(n_estimators=100)
pipeline = Pipeline([('sc', Scaler()), ('estimator', abr)])
bor_cv = KFold(n_splits=num_folds, shuffle = True, random_state=seed)
# BUG FIX: like the bagging block, this one was cross-validated on the
# FULL X_orig/y_orig instead of X_train/y_train — fixed so the
# comparison table scores every model on the same data.
gbr_result = cross_val_score(pipeline, X_train, y_train, cv = bor_cv)
gbr_result
print("Accuracy: %.3f%% (%.3f%%)" % (gbr_result.mean()*100.0, gbr_result.std()*100))
# Append this model's row to the comparison table.
tempResultsDf = pd.DataFrame({'K-Fold CV':['Boosting'], 'accuracy': [gbr_result.mean()*100.0], 'deviation(%)': [gbr_result.std()*100]}, index=[5])
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['K-Fold CV', 'accuracy', 'deviation(%)']]
resultsDf
# Recursive Feature Elimination over the interaction features, tuning
# the number of kept features with GridSearchCV (scored by R²).
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
lin_gc = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
hyper_params = [{'n_features_to_select': list(range(1, 9))}]
lm = LinearRegression()
# FIX: the original pre-fit `lm` here; that fit was redundant because
# GridSearchCV clones and fits the RFE estimator for every candidate.
rfe = RFE(lm)
model_cv = GridSearchCV(estimator=rfe,
                        param_grid=hyper_params,
                        scoring='r2',
                        cv=lin_gc,
                        verbose=1,
                        return_train_score=True)
model_cv.fit(X_train_poly, y_train_poly)
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results
# plotting cv results: train vs test R² as a function of features kept
plt.figure(figsize=(16, 6))
plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_test_score"])
plt.plot(cv_results["param_n_features_to_select"], cv_results["mean_train_score"])
plt.xlabel('number of features')
plt.ylabel('r-squared')
plt.title("Optimal Number of Features")
plt.legend(['test score', 'train score'], loc='upper left')
# final model: refit RFE with the chosen number of features and score it
# on the polynomial test split.
# NOTE(review): 10 lies OUTSIDE the searched grid range(1, 9) above —
# confirm this value against model_cv.best_params_.
n_features_optimal = 10
lm = LinearRegression()
rfe = RFE(lm, n_features_to_select=n_features_optimal)
rfe = rfe.fit(X_train_poly, y_train_poly)
# BUG FIX: predictions must come from the fitted RFE (which restricts to
# the selected features); the original called lm.predict with a model
# fitted on ALL polynomial features, so the reported score had nothing
# to do with the feature selection.
y_pred = rfe.predict(X_test_poly)
r2 = r2_score(y_test_poly, y_pred)
print("The best R2 score from GridSeach is: ",r2)
# Start the hyper-parameter-tuning summary table.
hyperDf = pd.DataFrame({'HyperParameterTuning':['LinearRegression'],'Method':['GridSearchCV'], 'Accuracy(%)': [r2*100]}, index=[1])
hyperDf = hyperDf[['HyperParameterTuning', 'Method', 'Accuracy(%)']]
hyperDf
# Re-split the unscaled data for the random-forest tuning runs.
X_train, X_test, y_train, y_test = train_test_split(X_orig, y_orig, train_size=0.7, test_size=0.3, random_state=100)
from sklearn.model_selection import GridSearchCV
# BUG FIX: RandomizedSearchCV is used below but was never imported
# anywhere in this file (NameError at runtime).
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from sklearn.metrics import mean_squared_error
rf = RandomForestRegressor(n_estimators= 100)
# Tune one hyper-parameter at a time, starting with max_features.
# NOTE(review): X_orig has only 6 columns after the aggregate drops, so
# candidates 7/9/12/16/22 exceed n_features and will fail to fit (they
# score as NaN under the default error_score) — confirm the grid.
random_grid = {'max_features': [1, 4, 7, 9, 12, 16, 22]}
rf_results = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1, return_train_score = True)
fitted_model = rf_results.fit(X_train, y_train)
print(rf_results.best_params_)
# FIX: refit with the value the search actually found — the original
# hard-coded max_features=4 regardless of best_params_.
rfc1 = RandomForestRegressor(max_features=rf_results.best_params_['max_features'])
rfc1.fit(X_train, y_train)
y_pred_rfr = rfc1.predict(X_test)
test_acc = rfc1.score(X_test, y_test)
print(f"Random forest Random search acccuracy score: {test_acc}")
# Next hyper-parameter: number of trees (n_estimators).
random_grid = {'n_estimators': range(100, 1500, 400)}
rf_results = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1, return_train_score = True)
fitted_model = rf_results.fit(X_train, y_train)
print(rf_results.best_params_)
# FIX: use the searched value instead of the hard-coded n_estimators=100,
# carrying forward max_features=4 chosen in the previous step.
rfc1 = RandomForestRegressor(n_estimators=rf_results.best_params_['n_estimators'], max_features= 4)
rfc1.fit(X_train, y_train)
y_pred_rfr = rfc1.predict(X_test)
test_acc = rfc1.score(X_test, y_test)
print(f"Random forest Random search acccuracy score: {test_acc}")
# Next hyper-parameter: minimum samples per leaf.
random_grid = {'min_samples_leaf': range(1, 10, 1)}
rf_results = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1, return_train_score = True)
fitted_model = rf_results.fit(X_train, y_train)
print(rf_results.best_params_)
# FIX: use the searched value instead of the hard-coded min_samples_leaf=1,
# keeping the earlier choices fixed.
rfc1 = RandomForestRegressor(n_estimators= 100, max_features= 4, min_samples_leaf=rf_results.best_params_['min_samples_leaf'])
rfc1.fit(X_train, y_train)
y_pred_rfr = rfc1.predict(X_test)
test_acc = rfc1.score(X_test, y_test)
print(f"Random forest Random search acccuracy score: {test_acc}")
# Next hyper-parameter: minimum samples required to split a node.
# BUG FIX: the grid started at 1, but sklearn requires
# min_samples_split >= 2 — candidate 1 fails every fit (NaN score under
# the default error_score). Start the range at 2.
random_grid = {'min_samples_split': range(2, 10, 1)}
rf_results = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1, return_train_score = True)
fitted_model = rf_results.fit(X_train, y_train)
print(rf_results.best_params_)
# FIX: use the searched value instead of the hard-coded min_samples_split=2.
rfc1 = RandomForestRegressor(n_estimators= 100, max_features= 4, min_samples_leaf= 1, min_samples_split=rf_results.best_params_['min_samples_split'])
rfc1.fit(X_train, y_train)
y_pred_rfr = rfc1.predict(X_test)
test_acc = rfc1.score(X_test, y_test)
print(f"Random forest Random search acccuracy score: {test_acc}")
# Final hyper-parameter: maximum tree depth.
random_grid = {'max_depth': [4,8,10,15]}
rf_results = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1, return_train_score = True)
fitted_model = rf_results.fit(X_train, y_train)
print(rf_results.best_params_)
# FIX: use the searched value instead of the hard-coded max_depth=15.
rfc1 = RandomForestRegressor(n_estimators= 100, max_features= 4, min_samples_leaf= 1, min_samples_split= 2, max_depth=rf_results.best_params_['max_depth'])
rfc1.fit(X_train, y_train)
y_pred_rfr = rfc1.predict(X_test)
test_acc = rfc1.score(X_test, y_test)
print(f"Random forest Random search acccuracy score: {test_acc}")
# BUG FIX: the method label said 'GridSearchCV', but this section used
# RandomizedSearchCV — corrected so the summary table is accurate.
tempResultsDf = pd.DataFrame({'HyperParameterTuning':['RandomForests'],'Method':['RandomizedSearchCV'], 'Accuracy(%)': [test_acc*100]}, index=[2])
hyperDf = pd.concat([hyperDf, tempResultsDf])
hyperDf = hyperDf[['HyperParameterTuning', 'Method', 'Accuracy(%)']]
hyperDf